library(ngsReports)
library(magrittr)
library(scales)
library(pander)
library(tidyverse)
if (interactive()) setwd(here::here("R"))
rawFqc <- list.files("../0_rawData/FastQC/", pattern = "zip", full.names = TRUE) %>%
    FastqcDataList()
trimFqc <- list.files("../1_trimmed/FastQC/", pattern = "zip", full.names = TRUE) %>%
    FastqcDataList()
deMuxFqc <- list.files("../2_demux/FastQC/", pattern = "zip", full.names = TRUE) %>%
    FastqcDataList()
origFqc <- list.files("../previousAnalysis/FastQC/", pattern = "fastqc.zip", full.names = TRUE) %>%
    FastqcDataList()
samples <- read_tsv("../0_rawData/samples.tsv")

Base Qualities

Each of the libraries was inspected for overall quality. Positions 3 & 4 from R2 libraries FCC21WPACXX-CHKPEI13070002_L6 and FCC21WPACXX-CHKPEI13070003_L7 showed clear problems with read qualities. This was likely due to an unsatisfactory nucleotide diversity in the original sequencing run and not enough phiX to overcome this, particularly as all R2 reads will begin with the restriction site (i.e. fragments will terminate with this as there is no R2 barcode).

plotBaseQuals(rawFqc, plotType = "boxplot")
*Base qualities before trimming*

Base qualities before trimming

plotBaseQuals(trimFqc, plotType = "boxplot")
*Base qualities after trimming*

Base qualities after trimming

Read Totals

list(
  readTotals(rawFqc) %>% mutate(Type = "Raw"),
  readTotals(trimFqc) %>% mutate(Type = "Trimmed")
) %>%
  bind_rows() %>%
  spread(key = "Type", value = "Total_Sequences") %>%
  mutate(Library = str_remove(Filename, "_[12].fq.gz")) %>%
  distinct(Library, .keep_all = TRUE) %>%
  dplyr::select(Library, Raw, Trimmed) %>%
  mutate(
    Retained = percent(Trimmed / Raw),
    Discarded = percent((Raw - Trimmed) / Raw)
  ) %>%
  arrange(Discarded) %>%
  bind_rows(
    tibble(
      Library = "Total",
      Raw = sum(.$Raw),
      Trimmed = sum(.$Trimmed)
    ) %>%
      mutate(
            Retained = percent(Trimmed / Raw),
            Discarded = percent((Raw - Trimmed) / Raw)
      )
  ) %>%
    pander(
      big.mark = ",",
      justify = "lrrrr",
      style = "rmarkdown",
      split.tables = Inf,
      emphasize.strong.rows = 8,
      caption = "*Results from adapter removal. Reads < 70bp after trimming were discarded during this process*"
    )
Results from adapter removal. Reads < 70bp after trimming were discarded during this process
Library Raw Trimmed Retained Discarded
FCC229TACXX-CHKPEI13070001_L3 56,981,567 55,762,854 97.861% 2.139%
FCC2GPDACXX-CHKPEI13070007_L6 66,601,007 64,940,945 97.507% 2.493%
FCC2GPDACXX-CHKPEI13070004_L2 78,411,565 76,201,274 97.181% 2.819%
FCC2GPDACXX-CHKPEI13070006_L4 70,874,412 68,811,191 97.089% 2.911%
FCC21WPACXX-CHKPEI13070003_L7 65,481,942 63,542,676 97.038% 2.962%
FCC2GPDACXX-CHKPEI13070005_L3 78,445,394 76,021,945 96.911% 3.089%
FCC21WPACXX-CHKPEI13070002_L6 69,729,067 67,032,866 96.133% 3.867%
Total 486,524,954 472,313,751 97% 3%

The first check after demultiplexing is to ensure that read were not assigned to multiple individuals by sabre pe, given that one mismatch was permitted per barcode/restriction-site. Read Totals before and after demultiplexing were then checked and the recovery rate was >93% for each library, with the clear exception of FCC21WPACXX-CHKPEI13070002_L6. Manual inspection revealed that a significant majority (~9.3x106 of 13.4x106) of these non-recovered reads contained truncated barcodes with the first two bases missing. If an additional recovery step was taken this would increase the recovery rate to 93% for this library, and may yield an additional 5-600,000 reads per sample. For the next most poorly recovered library, an additional step may recover an additional 3x106 reads. However, no further action was taken at this point.

trimReadTotals <- readTotals(trimFqc) %>%
    mutate(Library = str_remove_all(Filename, "_[12].fq.gz")) %>%
    distinct(Library, Total_Sequences)
deMuxReadTotals <- readTotals(deMuxFqc) %>%
    mutate(ID = str_remove_all(Filename, ".[12].fq.gz")) %>%
    distinct(ID, Total_Sequences) %>%
    left_join(samples, by = "ID") %>%
    group_by(Library) %>%
    summarise(DeMultiplexed = sum(Total_Sequences)) %>%
    filter(!is.na(Library))
trimReadTotals %>%
  left_join(deMuxReadTotals) %>%
  mutate(`Recovery Rate` = DeMultiplexed / Total_Sequences) %>%
  dplyr::select(Library, everything()) %>%
  dplyr::rename(`Total Sequences` = Total_Sequences) %>%
  mutate(`Recovery Rate` = percent(`Recovery Rate`)) %>%
  arrange(`Recovery Rate`) %>%
  bind_rows(
    tibble(
      Library = "Total",
      `Total Sequences` = sum(.$`Total Sequences`),
      DeMultiplexed = sum(.$DeMultiplexed)
    ) %>%
      mutate(
        `Recovery Rate` = percent(DeMultiplexed / `Total Sequences`)
      ) 
  ) %>%
  pander(
    split.tables = Inf, 
    big.mark = ",",
    justify = "lrrr",
    style = "rmarkdown",
    emphasize.strong.rows = 8,
    caption = "*Recovery rate from demultiplexing the 1996 and 2012 samples, after adapter removal*"
  ) 
Recovery rate from demultiplexing the 1996 and 2012 samples, after adapter removal
Library Total Sequences DeMultiplexed Recovery Rate
FCC21WPACXX-CHKPEI13070002_L6 67,032,866 53,568,147 79.913%
FCC2GPDACXX-CHKPEI13070005_L3 76,021,945 70,702,537 93.003%
FCC2GPDACXX-CHKPEI13070006_L4 68,811,191 66,677,690 96.899%
FCC229TACXX-CHKPEI13070001_L3 55,762,854 55,037,971 98.700%
FCC21WPACXX-CHKPEI13070003_L7 63,542,676 62,787,380 98.811%
FCC2GPDACXX-CHKPEI13070004_L2 76,201,274 75,316,445 98.839%
FCC2GPDACXX-CHKPEI13070007_L6 64,940,945 64,349,296 99.089%
Total 472,313,751 448,439,466 95%
cp <- paste("*Read Totals for each of the 1996 & 2012 samples.",
            "The black line indicates the mean library size,",
            "whilst the dashed green line indicates the mean library size based",
            "on the original library before demultiplexing.",
            "Bar colours indicate sample population as 1996 (blue) or 2012 (red).*")
plotly::ggplotly(
  deMuxFqc %>%
    magrittr::extract(grepl("(ora|gc).+1.fq.gz", fqName(.))) %>%
    readTotals() %>%
    mutate(ID = str_remove(Filename, ".1.fq.gz")) %>%
    left_join(samples) %>%
    mutate(Population = case_when(
      grepl("gc", ID) ~ "1996",
      grepl("ora", ID) ~ "2012"
    )) %>%
    ggplot(aes(ID, Total_Sequences, fill = Population)) +
    geom_bar(stat = "identity") +
    geom_hline(
      aes(yintercept = mn),
      data = . %>% summarise(mn = mean(Total_Sequences))
    ) +
    geom_hline(
      aes(yintercept = mn),
      data = . %>% group_by(Library) %>% summarise(mn = mean(Total_Sequences)),
      colour = "green",
      linetype = 2
    ) +
    facet_wrap(~Library, scales = "free_x", nrow = 2) +
    scale_y_continuous(labels = comma) +
    scale_fill_manual(values = c(rgb(0.1, 0.1, 0.7), rgb(0.7, 0.1, 0.1))) +
    labs(x = c(), y = c()) +
    theme_bw() +
    theme(
      axis.text.x = element_text(angle = 90),
      legend.position = "none"
    )
)

Read Totals for each of the 1996 & 2012 samples. The black line indicates the mean library size, whilst the dashed green line indicates the mean library size based on the original library before demultiplexing. Bar colours indicate sample population as 1996 (blue) or 2012 (red).

In summary, of the 486,524,954 initial reads, a total of 448,439,466 were retained after trimming and demultiplexing. This represents a combined recovery rate of 92% from the initial libraries.

Comparison against previous analysis

The revised strategy showed considerable improvement in yield for the libraries FCC21WPACXX-CHKPEI13070002_L6 and FCC21WPACXX-CHKPEI13070003_L7 which both contained exculsively 1996 samples. As such the revised strategy was pursued further.

list(
    origFqc %>%
        readTotals() %>%
        mutate(ID = str_remove(Filename, ".Lib..[12].fq"),
               Type = "Previous"),
    deMuxFqc %>%
        readTotals() %>%
        mutate(ID = str_remove(Filename, ".[12].fq.gz"),
               Type = "Repeat")
) %>%
    bind_rows() %>%
    distinct(ID, Type, Total_Sequences) %>%
    dplyr::select(ID, Type, Total_Sequences) %>%
    filter(grepl("(gc|ora)", ID)) %>%
    mutate(Population = case_when(
            grepl("gc", ID) ~ "1996",
            grepl("ora", ID) ~ "2012"
        )) %>%
    left_join(samples) %>%
    ggplot(aes(Library, Total_Sequences, fill = Type)) +
    geom_boxplot() +
    labs(y = "Total Reads") +
    scale_y_continuous(labels = comma) +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
*Comparison of library sizes after demultiplexing using sabre (blue), vs the original method using stacks (salmon).*

Comparison of library sizes after demultiplexing using sabre (blue), vs the original method using stacks (salmon).

list(
    origFqc %>%
        readTotals() %>%
        mutate(ID = str_remove(Filename, ".Lib..[12].fq"),
               Type = "Previous"),
    deMuxFqc %>%
        readTotals() %>%
        mutate(ID = str_remove(Filename, ".[12].fq.gz"),
               Type = "Repeat")
) %>%
    bind_rows() %>%
    distinct(ID, Type, Total_Sequences) %>%
    dplyr::select(ID, Type, Total_Sequences) %>%
    filter(grepl("(gc|ora)", ID)) %>%
    spread("Type", "Total_Sequences") %>%
    mutate(Change = Repeat / Previous,
           Population = case_when(
            grepl("gc", ID) ~ "1996",
            grepl("ora", ID) ~ "2012"
        )) %>%
    left_join(samples) %>%
    ggplot(aes(ID, Change, fill = Population)) +
    geom_bar(stat = "identity") +
    geom_hline(yintercept = 1, linetype = 2) +
    facet_wrap(~Library, scales = "free_x", nrow = 2) +
    scale_y_continuous(labels = percent) +
    scale_fill_manual(values = c(rgb(0.1, 0.1, 0.7, 0.8), rgb(0.7, 0.1, 0.1, 0.8))) +
    labs(x = c(), y = "% Improvement") +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 90),
          legend.position = "none")
*Changes in library size using the improved recovery strategy. All samples showed an improved rate of recovery.*

Changes in library size using the improved recovery strategy. All samples showed an improved rate of recovery.

list(
    origFqc %>%
        magrittr::extract(!grepl("(ora|gc)", fqName(.))) %>%
        readTotals(),
    deMuxFqc %>%
        magrittr::extract(grepl("1.fq", fqName(.))) %>%
        readTotals()
) %>%
    bind_rows() %>%
    mutate(Population = case_when(
        grepl("ora", Filename) ~ 2012,
        grepl("gc", Filename) ~ 1996,
        !grepl("(ora|gc)", Filename) ~ 2010
    ),
    Population = as.factor(Population)) %>%
    mutate(ID = str_remove(Filename, ".[12].fq.gz")) %>%
    ggplot(aes(Population, Total_Sequences, fill = Population)) +
    geom_boxplot() +
    scale_y_continuous(labels = comma) +
    theme_bw()
*Comparison of library sizes for the three populations.*

Comparison of library sizes for the three populations.

GC Content

deMuxFqc %>%
    magrittr::extract(grepl("(ora|gc)", fqName(.))) %>%
    plotGcContent(plotType = "line", usePlotly = TRUE, theoreticalGC = FALSE)

GC content for all 1996 and 2012 samples.

Inspection by GC content showed that gc2709 and gc2700 appeared to have an exaggerated peak around 60%, whilst all other samples showed a more broad spread across the range. From the Turretfield samples (previous analysis), pt1125 showed an unexpected peak around 50% indicating that sample may contain reads from a different species. This sample should be excluded from all further analysis.

origFqc %>%
    magrittr::extract(!grepl("(ora|gc)", fqName(.))) %>%
    plotGcContent(plotType = "line", usePlotly = TRUE, theoreticalGC = FALSE)

GC content for all Turretfield samples.

Read Lengths

deMuxFqc %>%
    plotSeqLengthDistn(usePlotly = TRUE, dendrogram = TRUE)

Length Distribution for all files

Sequence Content

plotSeqContent(deMuxFqc, usePlotly = TRUE, dendrogram = TRUE, cluster = TRUE)

Sequence content showing the presence of the RS in both the Turretfield and R2 samples.

Conclusion

  • The restriction site should be removed from both R2 and Turretfield samples, especially given the high error rate at positions 2/3 in some R2 samples
  • No further processing is required as the presence of the restriction site at both ends means fixed length reads are not required for Stacks

SessionInfo

pander(sessionInfo())

R version 3.6.2 (2019-12-12)

Platform: x86_64-pc-linux-gnu (64-bit)

locale: LC_CTYPE=en_AU.UTF-8, LC_NUMERIC=C, LC_TIME=en_AU.UTF-8, LC_COLLATE=en_AU.UTF-8, LC_MONETARY=en_AU.UTF-8, LC_MESSAGES=en_AU.UTF-8, LC_PAPER=en_AU.UTF-8, LC_NAME=C, LC_ADDRESS=C, LC_TELEPHONE=C, LC_MEASUREMENT=en_AU.UTF-8 and LC_IDENTIFICATION=C

attached base packages: parallel, stats, graphics, grDevices, utils, datasets, methods and base

other attached packages: forcats(v.0.4.0), stringr(v.1.4.0), dplyr(v.0.8.3), purrr(v.0.3.3), readr(v.1.3.1), tidyr(v.1.0.0), tidyverse(v.1.3.0), pander(v.0.6.3), scales(v.1.1.0), magrittr(v.1.5), ngsReports(v.1.1.2), tibble(v.2.1.3), ggplot2(v.3.2.1) and BiocGenerics(v.0.32.0)

loaded via a namespace (and not attached): colorspace(v.1.4-1), hwriter(v.1.3.2), ellipsis(v.0.3.0), XVector(v.0.26.0), GenomicRanges(v.1.38.0), ggdendro(v.0.1-20), fs(v.1.3.1), rstudioapi(v.0.10), farver(v.2.0.1), ggrepel(v.0.8.1), lubridate(v.1.7.4), xml2(v.1.2.2), codetools(v.0.2-16), leaps(v.3.0), knitr(v.1.26), zeallot(v.0.1.0), jsonlite(v.1.6), Rsamtools(v.2.2.1), Cairo(v.1.5-10), broom(v.0.5.2), cluster(v.2.1.0), dbplyr(v.1.4.2), shiny(v.1.4.0), compiler(v.3.6.2), httr(v.1.4.1), backports(v.1.1.5), fastmap(v.1.0.1), assertthat(v.0.2.1), Matrix(v.1.2-18), lazyeval(v.0.2.2), cli(v.1.1.0), later(v.1.0.0), htmltools(v.0.4.0), tools(v.3.6.2), gtable(v.0.3.0), glue(v.1.3.1), GenomeInfoDbData(v.1.2.2), reshape2(v.1.4.3), FactoMineR(v.2.0), ShortRead(v.1.44.0), Rcpp(v.1.0.3), Biobase(v.2.46.0), cellranger(v.1.1.0), vctrs(v.0.2.0), Biostrings(v.2.54.0), nlme(v.3.1-143), crosstalk(v.1.0.0), xfun(v.0.11), rvest(v.0.3.5), mime(v.0.7), lifecycle(v.0.1.0), zlibbioc(v.1.32.0), MASS(v.7.3-51.4), zoo(v.1.8-6), promises(v.1.1.0), hms(v.0.5.2), SummarizedExperiment(v.1.16.0), RColorBrewer(v.1.1-2), yaml(v.2.2.0), latticeExtra(v.0.6-28), stringi(v.1.4.3), highr(v.0.8), S4Vectors(v.0.24.0), BiocParallel(v.1.20.0), truncnorm(v.1.0-8), GenomeInfoDb(v.1.22.0), rlang(v.0.4.2), pkgconfig(v.2.0.3), matrixStats(v.0.55.0), bitops(v.1.0-6), evaluate(v.0.14), lattice(v.0.20-38), GenomicAlignments(v.1.22.1), htmlwidgets(v.1.5.1), labeling(v.0.3), tidyselect(v.0.2.5), plyr(v.1.8.4), R6(v.2.4.1), IRanges(v.2.20.1), generics(v.0.0.2), DelayedArray(v.0.12.0), DBI(v.1.0.0), pillar(v.1.4.2), haven(v.2.2.0), withr(v.2.1.2), scatterplot3d(v.0.3-41), RCurl(v.1.95-4.12), modelr(v.0.1.5), crayon(v.1.3.4), plotly(v.4.9.1), rmarkdown(v.1.18), grid(v.3.6.2), readxl(v.1.3.1), data.table(v.1.12.6), reprex(v.0.3.0), digest(v.0.6.23), flashClust(v.1.01-2), webshot(v.0.5.2), xtable(v.1.8-4), httpuv(v.1.5.2), stats4(v.3.6.2), munsell(v.0.5.0), viridisLite(v.0.3.0) and kableExtra(v.1.1.0)